Introdução

Conjunto de dados utilizado

Análise exploratória de dados do IMDB sobre seriados de TV e streaming. Os dados originais e as variáveis vêm deste repositório (https://github.com/nazareno/imdb-series). Lá consta a explicação de como os dados foram gerados e do significado de cada variável.

# Load the IMDB episode table. Every column is parsed as double unless it is
# listed explicitly as character below.
episodes <- read_csv(
  here("data/series_from_imdb.csv"),
  col_types = cols(
    series_name = col_character(),
    episode = col_character(),
    url = col_character(),
    season = col_character(),
    .default = col_double()
  ),
  progress = FALSE
)

# Keep only the two series compared in this report.
episodes <- filter(
  episodes,
  series_name %in% c("Game of Thrones", "Xena a Princesa Guerreira")
)

# Print a compact overview of the columns and their first values.
glimpse(episodes)
Observations: 194
Variables: 18
$ series_name <chr> "Xena a Princesa Guerreira", "Xena a Princesa Guerreira", "Xena a Prin...
$ episode     <chr> "Sins of the Past", "Chariots of War", "Dreamworker", "Cradle of Hope"...
$ series_ep   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,...
$ season      <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", ...
$ season_ep   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,...
$ url         <chr> "http://www.imdb.com/title/tt0394990/", "http://www.imdb.com/title/tt0...
$ user_rating <dbl> 7.9, 7.4, 7.7, 7.4, 7.5, 7.7, 7.5, 8.0, 7.8, 7.6, 7.1, 7.2, 6.9, 8.2, ...
$ user_votes  <dbl> 440, 339, 318, 297, 288, 282, 270, 303, 278, 287, 271, 269, 271, 266, ...
$ r1          <dbl> 0.003623188, 0.025641026, 0.064516129, 0.023474178, 0.003759398, 0.040...
$ r2          <dbl> 0.04347826, 0.03846154, 0.03548387, 0.02347418, 0.02255639, 0.04054054...
$ r3          <dbl> 0.010869565, 0.038461538, 0.029032258, 0.004694836, 0.000000000, 0.009...
$ r4          <dbl> 0.007246377, 0.034188034, 0.022580645, 0.023474178, 0.003759398, 0.018...
$ r5          <dbl> 0.018115942, 0.042735043, 0.019354839, 0.046948357, 0.045112782, 0.022...
$ r6          <dbl> 0.02536232, 0.12393162, 0.03870968, 0.06103286, 0.12781955, 0.04954955...
$ r7          <dbl> 0.08695652, 0.16239316, 0.05161290, 0.11267606, 0.13909774, 0.07657658...
$ r8          <dbl> 0.1086957, 0.1666667, 0.1354839, 0.2112676, 0.1804511, 0.1171171, 0.08...
$ r9          <dbl> 0.15579710, 0.09829060, 0.17096774, 0.18309859, 0.13909774, 0.22072072...
$ r10         <dbl> 0.5398551, 0.2692308, 0.4322581, 0.3098592, 0.3383459, 0.4054054, 0.44...

Episódios da metade da temporada

Para tornar nossa discussão mais interessante, vamos gerar uma nova informação: “Um episódio faz parte da metade da temporada?” (middle_eps). Um episódio é da metade da temporada se está entre os 60% centrais dos episódios de uma temporada, isto é, se o seu número na temporada fica estritamente entre os percentis 20 e 80 das posições dos episódios daquela temporada.

# Per-season summary: number of episodes (n) and the 20th / 80th percentiles
# of the episode positions 1..n within each (series_name, season) group.
sumario_simples <-
  episodes %>%
  group_by(series_name, season) %>%
  summarise(
    n = n(),
    # names = FALSE keeps p20/p80 as plain numbers, without the "20%"/"80%"
    # labels that quantile() attaches by default.
    p20 = quantile(seq_len(n), 0.20, names = FALSE),
    p80 = quantile(seq_len(n), 0.80, names = FALSE)
  ) %>%
  # summarise() only peels off the last grouping level; drop the rest so the
  # summary is an ordinary, ungrouped table.
  ungroup()

# Attach the per-season bounds to every episode and flag the episodes whose
# position in the season lies strictly between p20 and p80. The comparison is
# row-wise, so no grouping is needed here.
episodes <- episodes %>%
  left_join(sumario_simples, by = c("series_name", "season")) %>%
  mutate(middle_eps = (season_ep > p20) & (season_ep < p80))

# Show the new flag next to the series name and the overall episode number.
episodes %>%
  select(series_name, series_ep, middle_eps)

E aí, quem se saiu melhor?

# Margins handed to plotly's layout() below.
# NOTE(review): b / r / t are presumably bottom / right / top — confirm
# against the plotly layout reference.
m <- list(
  b = 100,
  r = 200,
  t = 75
)

# One jittered point per episode: series on the x axis, user rating on the
# y axis, coloured by the middle-of-season flag, one facet per season.
# `group = episode` is mapped so the episode title is available to the
# ggplotly() tooltip.
p <- episodes %>%
  ggplot(aes(
    x = series_name,
    y = user_rating,
    color = middle_eps,
    group = episode
  )) +
  geom_jitter(width = 0.3, alpha = 0.7) +
  facet_wrap(~ season) +
  xlab("") +
  ylab("Votação do Usuário") +
  # Named labels tie each short label to its series explicitly instead of
  # relying on the order of the axis levels.
  scale_x_discrete(labels = c(
    "Game of Thrones" = "GOT",
    "Xena a Princesa Guerreira" = "Xena"
  )) +
  labs(color = "Metade da temporada?") +
  ggtitle("GOT x Xena (Temporada a Temporada)") +
  # A single theme() call replaces the former `+ theme_update(...)`.
  # theme_update() changes the session-wide theme and returns the previous
  # theme, so adding its result to the plot could override the axis-text
  # rotation set here.
  # NOTE(review): hjust = -1 is kept from the original call; it lies outside
  # the usual 0-1 range — confirm the intended title position.
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.title = element_text(hjust = -1)
  )

# Convert to an interactive plot; the tooltip shows the episode title plus
# the x and y values.
ggplotly(p, tooltip = c("episode", "x", "y")) %>%
  layout(autosize = FALSE, margin = m)
LS0tCnRpdGxlOiAiRURBIEludGVyYXRpdmEgc29icmUgc8OpcmllcyBubyBJTURCIgpzdWJ0aXRsZTogJycKYXV0aG9yOiAiSm9zw6kgQmVuYXJkaSBkZSBTb3V6YSBOdW5lcyIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB5ZXMKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB5ZXMKLS0tCgo8YnI+PC9icj4KCiMjIEludHJvZHXDp8OjbwoKIyMjIENvbmp1bnRvIGRlIGRhZG9zIHV0aWxpemFkbwoKPiBBbsOhbGlzZSBleHBsb3JhdMOzcmlhIGRlIGRhZG9zIGRvIFtJTURCXShodHRwczovL3d3dy5pbWRiLmNvbS8pICBzb2JyZSBzZXJpYWRvcyBkZSBUViBlIFN0cmVhbWluZy4gT3MgZGFkb3Mgb3JpZ2luYWlzIGUgYXMgdmFyacOhdmVpcyB2w6ptIFtkZXN0ZSByZXBvc2l0b3Jpb10oaHR0cHM6Ly9naXRodWIuY29tL25hemFyZW5vL2ltZGItc2VyaWVzKSAuIEzDoSBjb25zdGEgYSBleHBsaWNhw6fDo28gZGUgY29tbyBvcyBkYWRvcyBmb3JhbSBnZXJhZG9zIGUgZG8gc2lnbmlmaWNhZG8gZGUgY2FkYSB2YXJpw6F2ZWwuCgpgYGB7ciBzZXR1cCwgZWNobz1GQUxTRSwgd2FybmluZz1GQUxTRSwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoaGVyZSkKbGlicmFyeShwbG90bHkpCnRoZW1lX3NldCh0aGVtZV9idygpKQpgYGAKCmBgYHtyfQplcGlzb2RlcyA8LSByZWFkX2NzdihoZXJlKCJkYXRhL3Nlcmllc19mcm9tX2ltZGIuY3N2IiksIAogICAgICAgICAgICAgICAgICAgIHByb2dyZXNzID0gRkFMU0UsCiAgICAgICAgICAgICAgICAgICAgY29sX3R5cGVzID0gY29scyguZGVmYXVsdCA9IGNvbF9kb3VibGUoKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzZXJpZXNfbmFtZSA9IGNvbF9jaGFyYWN0ZXIoKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlcGlzb2RlID0gY29sX2NoYXJhY3RlcigpLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHVybCA9IGNvbF9jaGFyYWN0ZXIoKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlYXNvbiA9IGNvbF9jaGFyYWN0ZXIoKSkpICU+JSAKICAgIGZpbHRlcihzZXJpZXNfbmFtZSAlaW4lIGMoIkdhbWUgb2YgVGhyb25lcyIsIlhlbmEgYSBQcmluY2VzYSBHdWVycmVpcmEiKSkgCgplcGlzb2RlcyAlPiUgCiAgICBnbGltcHNlKCkKYGBgCgoqKioqCgojIyMgRXBpc8OzZGlvcyBkYSBtZXRhZGUgZGEgdGVtcG9yYWRhCgo+QSB0w610dWxvIGRlIHRvcm5hciBub3NzYSBkaXNjdXNzw6NvIG1haXMgaW50ZXJlc3NhbnRlIHZhbW9zIGdlcmFyIHVtYSBub3ZhIGluZm9ybWHDp8OjbyA6ICJVbSBlcGlzw7NkaW8gZmF6IHBhcnRlIGRhIG1ldGFkZSBkYSB0ZW1wb3JhZGE/IiAoKm1pZGRsZV9lcHMqKS4gVW0gZXBpc8OzZGlvIMOpIGRhIG1ldGFkZSBkYSB0ZW1w
b3JhZGEgc2UgZXN0w6EgZW50cmUgb3MgNjAlIGRvcyBlcGlzw7NkaW9zIGNlbnRyYWlzIGRlIHVtYSB0ZW1wb3JhZGEuICAKCmBgYHtyfQpzdW1hcmlvX3NpbXBsZXMgPC0gCiAgICBlcGlzb2RlcyAlPiUgCiAgICBzZWxlY3Qoc2Vhc29uX2VwLHNlYXNvbixzZXJpZXNfbmFtZSkgJT4lCiAgICBncm91cF9ieShzZXJpZXNfbmFtZSxzZWFzb24pICU+JSAKICAgIHN1bW1hcmlzZShuID0gbigpLAogICAgICAgICAgICAgICBwMjAgPSBxdWFudGlsZShzZXEoZnJvbT0xLCB0bz1uLCBieT0xKSwgMC4yMCksCiAgICAgICAgICAgICAgIHA4MCA9IHF1YW50aWxlKHNlcShmcm9tPTEsIHRvPW4sIGJ5PTEpLCAwLjgwKSkKCmVwaXNvZGVzIDwtIGxlZnRfam9pbihlcGlzb2Rlcywgc3VtYXJpb19zaW1wbGVzLAogICAgICAgICAgICAgICAgICAgICAgYnkgPSBjKCJzZXJpZXNfbmFtZSIsICJzZWFzb24iKSkgJT4lIAogICAgZ3JvdXBfYnkoc2VyaWVzX25hbWUsIHNlYXNvbikgJT4lCiAgICBtdXRhdGUobWlkZGxlX2VwcyA9IChzZWFzb25fZXAgPiBwMjApICYKICAgICAgICAgICAgICAgKHNlYXNvbl9lcCA8IHA4MCkpICU+JSAKICAgIHVuZ3JvdXAoKQplcGlzb2RlcyAlPiUgCiAgICBzZWxlY3Qoc2VyaWVzX25hbWUsIHNlcmllc19lcCwgbWlkZGxlX2VwcykKYGBgCgoqKioKCiMjIyBFIGHDrSwgcXVlbSBzZSBzYWl1IG1lbGhvcj8KCgpgYGB7ciwgd2FybmluZz1GQUxTRSwgbWVzc2FnZT1GQUxTRX0KbSA8LSBsaXN0KAogIGIgPSAxMDAsCiAgciA9IDIwMCwKICB0ID0gNzUKICApCgpwIDwtIGVwaXNvZGVzICU+JSAKICAgICAgZ2dwbG90KGFlcyh4ID0gc2VyaWVzX25hbWUsIHkgPSB1c2VyX3JhdGluZywgCiAgICAgICAgICAgICAgICAgY29sb3I9bWlkZGxlX2VwcywKICAgICAgICAgICAgICAgICBncm91cD1lcGlzb2RlKSkgKyAKICAgICAgICBnZW9tX2ppdHRlcih3aWR0aCA9IDAuMywgYWxwaGE9MC43KSArCiAgICAgICAgZmFjZXRfd3JhcCh+IHNlYXNvbikgKwogICAgICAgIHhsYWIoIiIpICsKICAgICAgICB5bGFiKCJWb3Rhw6fDo28gZG8gVXN1w6FyaW8iKSArCiAgICAgICAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA5MCwgaGp1c3QgPSAxKSkgICsKICAgICAgICBzY2FsZV94X2Rpc2NyZXRlKGxhYmVscz1jKCJHT1QiLCAiWGVuYSIpKSArCiAgICAgICAgbGFicyhjb2xvcj0nTWV0YWRlIGRhIHRlbXBvcmFkYT8nKSArCiAgICAgICAgZ2d0aXRsZShwYXN0ZSgiR09UIHggWGVuYSAoVGVtcG9yYWRhIGEgVGVtcG9yYWRhKSIpKSArCiAgICAgICAgdGhlbWVfdXBkYXRlKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAtMSkpCgpnZ3Bsb3RseShwLCB0b29sdGlwID0gYygiZXBpc29kZSIsICJ4IiwieSIpKSAlPiUKICBsYXlvdXQoYXV0b3NpemUgPSBGLCBtYXJnaW4gPSBtKQpgYGAKCgo=